rm(list=ls())
set.seed(123)
# Data Manipulation
library(tidyverse)  # Comprehensive data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)      # Data manipulation (part of tidyverse)
library(stringr)    # String manipulation
library(reshape2)   # Data reshaping and melting
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Visualization
library(ggplot2)    # Data visualization
library(corrplot)   # Correlation matrix visualization
## corrplot 0.92 loaded
library(ggrepel)    # Improved text label placement
library(hrbrthemes) # Ggplot themes
library(viridis)    # Color scales for ggplot2
## Loading required package: viridisLite
# Statistical Analysis and Clustering
library(factoextra) # PCA and clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)    # Clustering algorithms
library(mclust)     # Model-based clustering
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(nFactors)   # Determining number of factors
## Loading required package: lattice
## 
## Attaching package: 'nFactors'
## 
## The following object is masked from 'package:lattice':
## 
##     parallel
library(insight)    # Extract and interpret model information
## Warning: package 'insight' was built under R version 4.4.1
library(caret)      # Machine learning
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
pitching_df <- read.csv("MLB_pitching_stats_ALL_PITCHING.csv")
head(pitching_df)
dim(pitching_df)
## [1] 4233  108

pitching_df Preprocessing

Visualization

# Count the number of 0 values in each column
colSums(pitching_df == 0)
##                            X                         year 
##                            1                            0 
##                     playerId                   playerName 
##                            0                            0 
##                         type                         rank 
##                            0                            0 
##               playerFullName              playerFirstName 
##                            0                            0 
##               playerLastName                playerUseName 
##                            0                            0 
##           playerInitLastName                       teamId 
##                            0                            0 
##                   teamAbbrev                     teamName 
##                            0                            0 
##                teamShortName                   leagueName 
##                            0                            0 
##                     leagueId               positionAbbrev 
##                            0                            0 
##                     position        primaryPositionAbbrev 
##                            0                            0 
##            winningPercentage               runsScoredPer9 
##                            0                            0 
##                 battersFaced                        babip 
##                            0                            0 
##                          obp                          slg 
##                           49                           86 
##                          ops               strikeoutsPer9 
##                           49                            0 
##              baseOnBallsPer9                 homeRunsPer9 
##                            0                            0 
##                     hitsPer9           strikesoutsToWalks 
##                            0                            0 
##             inheritedRunners       inheritedRunnersScored 
##                         1651                         2168 
##            bequeathedRunners      bequeathedRunnersScored 
##                          809                         1565 
##                  stolenBases               caughtStealing 
##                         1439                         2411 
##                qualityStarts                gamesFinished 
##                         3192                         1238 
##                      doubles                      triples 
##                          535                         2550 
##                         gidp                      gidpOpp 
##                         1124                          191 
##                  wildPitches                        balks 
##                         1684                         3636 
##                     pickoffs                  totalSwings 
##                         3455                            1 
##               swingAndMisses                  ballsInPlay 
##                          166                            5 
##                   runSupport             strikePercentage 
##                          473                            0 
##             pitchesPerInning    pitchesPerPlateAppearance 
##                            0                            0 
##      walksPerPlateAppearance strikeoutsPerPlateAppearance 
##                          266                          274 
##   homeRunsPerPlateAppearance            walksPerStrikeout 
##                          629                            0 
##                          iso                      flyOuts 
##                          264                          189 
##                      popOuts                     lineOuts 
##                          582                          484 
##                   groundOuts                      flyHits 
##                          126                          485 
##                      popHits                     lineHits 
##                         3682                          246 
##                   groundHits                  gamesPlayed 
##                          384                            0 
##                 gamesStarted                      airOuts 
##                         2423                           75 
##                         runs                     homeRuns 
##                          225                          629 
##                   strikeOuts                  baseOnBalls 
##                          274                          266 
##             intentionalWalks                         hits 
##                         2837                           86 
##                   hitByPitch                          avg 
##                         1377                           86 
##                       atBats         stolenBasePercentage 
##                            0                            0 
##         groundIntoDoublePlay              numberOfPitches 
##                         1124                            0 
##                          era               inningsPitched 
##                            0                            5 
##                         wins                       losses 
##                         1597                         1472 
##                        saves            saveOpportunities 
##                         3245                         2677 
##                        holds                   blownSaves 
##                         2493                         2935 
##                   earnedRuns                         whip 
##                          244                            0 
##                         outs                 gamesPitched 
##                            5                            0 
##                completeGames                     shutouts 
##                         4092                         4147 
##                      strikes                   hitBatsmen 
##                            0                         1377 
##                   totalBases          groundOutsToAirouts 
##                           86                            0 
##                winPercentage           strikeoutWalkRatio 
##                            0                            0 
##            strikeoutsPer9Inn                 walksPer9Inn 
##                            0                            0 
##                  hitsPer9Inn         catchersInterference 
##                            0                         3908 
##                     sacBunts                     sacFlies 
##                         2890                         1842
par(mar = c(5, 10, 4, 2))  # Increase left margin to fit labels
barplot(colSums(pitching_df == 0)[colSums(pitching_df == 0) >= 2], las=2, horiz = T)

# Plot games played
ggplot(pitching_df, aes(x = gamesPlayed)) +
  geom_histogram(binwidth = 1, fill = "darkblue", color = "black")+
  geom_vline(xintercept = 5, color = "red", linetype = "dashed")

# Correct pitching_df types for some character columns
pitching_df <- pitching_df %>%
  mutate(across(c("winPercentage", "runsScoredPer9", "babip", "strikeoutsPer9", "baseOnBallsPer9", "homeRunsPer9", "hitsPer9", "strikesoutsToWalks", "pitchesPerInning", "walksPerStrikeout", "stolenBasePercentage", "era", "whip", "groundOutsToAirouts", "winPercentage", "winPercentage", "strikeoutWalkRatio", "strikeoutsPer9Inn", "walksPer9Inn", "hitsPer9Inn"), as.numeric))
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.
pitching_df <- pitching_df %>% filter(year == 2024)
pitching_df <- pitching_df %>% filter(gamesPlayed > 5)

pitching_df$gidpOpp <- ifelse(pitching_df$battersFaced > 0, pitching_df$gidpOpp / pitching_df$battersFaced, NA)

pitching_df$swingAndMisses <- ifelse(pitching_df$battersFaced > 0, pitching_df$swingAndMisses / pitching_df$battersFaced, NA)

pitching_df$ballsInPlay <- ifelse(pitching_df$battersFaced > 0, pitching_df$ballsInPlay / pitching_df$battersFaced, NA)

# Define the columns to mark for deletion
columns_to_delete <- c("X",
  "playerId", "year", "playerName", "type", "rank", "playerFirstName",
  "playerLastName", "playerUseName", "playerInitLastName", "teamId",
  "runSupport", "teamAbbrev", "teamName", "leagueName", "leagueId",
  "position", "primaryPositionAbbrev", "ops",
  "strikesoutsToWalks", "stolenBases", "caughtStealing", "doubles",
  "triples", "gidp", "wildPitches", "balks", "pickoffs", "totalSwings",
  "pitchesPerInning", "walksPerStrikeout","lineOuts", "airOuts",
  "catchersInterference", "sacBunts", "sacFlies",
  "groundOuts", "strikeoutWalkRatio", "popOuts", 'lineOuts', 'groundOuts',
  "winningPercentage",
  "wins","losses", "atBats", "numberOfPitches",
  "gamesPitched", "intentionalWalks",
  "strikeoutsPer9Inn", "baseOnBallsPer9",
  "hitsPer9Inn","gidp",
  "hitBatsmen", "groundIntoDoublePlay", "gamesPlayed","stolenBasePercentage"
)
# Added : groundintodoubleplay, gamesplayed
# pitching_df Kept: "popHits", "lineHits","groundHits", "flyOuts","wins", "losses"

# Remove selected columns
pitching_df <- pitching_df %>%
  select(-one_of(columns_to_delete))
# Only keep numerical pitching_df
pitching_df.num <- pitching_df %>%
  select_if(is.numeric)

corr_matrix <- cor(pitching_df.num, use = "complete.obs")

corrplot(corr_matrix, 
         method = "color",
         col = colorRampPalette(c("blue", "white", "red"))(200), 
         type = "upper",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.35,
         tl.cex = 0.5, 
         
         number.digits = 3 # Set precision for display
)

Find and print pairs with correlation 1

# Find and print pairs with correlation 1
correlated_columns <- which(corr_matrix == 1, arr.ind = TRUE)
print("Pairs with correlation 1:")
## [1] "Pairs with correlation 1:"
for (i in 1:nrow(correlated_columns)) {
  row <- correlated_columns[i, 1]
  col <- correlated_columns[i, 2]
  
  # Only print pairs where the row index is less than the column index to avoid duplicates
  if (row < col) {
    print(paste(colnames(cor_matrix)[row], colnames(cor_matrix)[col]))
  }
}

Handling missing values and outliers

# Replace NA values with the mean of the column
pitching_df.num <- pitching_df.num %>%
  mutate(across(everything(), ~ifelse(is.na(.), mean(., na.rm = TRUE), .)))

Show the distributions of all variables

data_long <- pitching_df.num %>% 
  pivot_longer(everything(), names_to = "variable", values_to = "value")

ggplot(data_long, aes(x = value)) +
  geom_density(fill = "blue", alpha = 0.4) +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Distributions of All Variables", x="", y="") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.text = element_text(size = 7)
)

ggsave("distributions_all.png", plot = last_plot(), width = 8, height = 6, dpi = 300)

# Box plot of all variables
scaled_data <- scale(sqrt(pitching_df.num))
boxplot(scaled_data, las=2)

ERA histrogram

p <- ggplot(pitching_df.num, aes(x = pitching_df$era)) +
      geom_histogram(bins = 50, fill = "darkblue", color = "black", alpha = 0.7) +
      # geom_boxplot(fill = "darkblue", color = "black", alpha = 0.7) +
      labs(title = paste("Histogram of era"), x = "Era", y = "Frequency") +
      theme_minimal()
    # Print the plot to display it
print(p)

ggsave("histogram_era.png", plot = last_plot(), width = 8, height = 6, dpi = 300)

pitching_df Analysis

Correlation of important variables

cor_matrix <- cor(pitching_df[, c("babip", "obp", "slg", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "era", "whip")])
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex=0.7,col = colorRampPalette(c("blue", "white", "red"))(200))

# Correlations for starting pitchers
starter_pitching_df <- pitching_df[pitching_df$gamesStarted > 4, ]
cor_matrix_starters <- cor(starter_pitching_df[, c("era", "whip", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "baseOnBalls", "inningsPitched", "gamesStarted")])
corrplot(cor_matrix_starters, method = "color", tl.cex = 0.8, addCoef.col = "black", number.cex=0.7, col = colorRampPalette(c("blue", "white", "red"))(200))

# Correlations for relief pitchers
reliever_pitching_df <- pitching_df[pitching_df$gamesFinished > 4, ]
cor_matrix_relievers <- cor(reliever_pitching_df[, c("era", "whip", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "baseOnBalls", "gamesFinished", "saves", "saveOpportunities", "holds", "blownSaves")])
corrplot(cor_matrix_relievers, method = "color", tl.cex = 0.8, addCoef.col = "black", number.cex=0.7, col = colorRampPalette(c("blue", "white", "red"))(200))

Plots of important variables

ggplot_era <- ggplot(pitching_df, aes(x = era, y = whip)) +
  geom_point(aes(color = gamesStarted)) +
  labs(title = "ERA vs WHIP", x = "ERA", y = "WHIP") +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")
ggplot_era # Lower whip and era values correlate to lower values with the direct other variable

ggsave("era_whip.png", plot = last_plot(), width = 8, height = 6, dpi = 300)

ggplot_strikouts <- ggplot(pitching_df, aes(x = strikeoutsPer9, y = walksPer9Inn)) +
  geom_point(aes(color = battersFaced)) +
  labs(title = "Strikeouts per 9 vs Home Runs per 9", x = "Strikeouts per 9", y = "Home Runs per 9") +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")
ggplot_strikouts# Values seem uncorrelated

ggsave("strikeouts_walks.png", plot = last_plot(), width = 8, height = 6, dpi = 300)
# Plot data for Starting pitchers
ggplot(starter_pitching_df, aes(x = battersFaced, y = era)) +
  geom_point(aes(color = battersFaced)) +
  labs(title = "Strikeouts Over Time", x = "Batters Faced", y = "Strike Outs") +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")

# plot data for Relievers
ggplot(reliever_pitching_df, aes(x = battersFaced, y = era)) +
  geom_point(aes(color = era)) +
  labs(title = "Strikeouts Over Time", x = "Batters Faced", y = "Strike Outs") +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")

# Assuming you have two previously created data frames : starter_pitching_df and reliever_pitching_df

# Add a new column to label the data as "Starter" or "Reliever"
starter_pitching_df$Position <- "Starter"
reliever_pitching_df$Position <- "Reliever"

# Combine the two data frames into one
combined_pitching_df <- rbind(starter_pitching_df, reliever_pitching_df)

# Now, plot both data sets with two boxplots side by side
box_plot_era <-ggplot(combined_pitching_df, aes(x = Position, y = era, fill = Position)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11),
    axis.text.x = element_text(angle = 0, hjust = 0.5)  # Adjust x-axis labels for readability
  ) +
  coord_flip() +
  ggtitle("Boxplot of ERA for Starters vs Relievers") +
  xlab("Pitcher Type (Starter vs Reliever)")
box_plot_era

ggsave("boxplot_era.png", plot = last_plot(), width = 10, height = 6, dpi = 300)

PCA

Apply PCA and visualize it

pca <- prcomp(pitching_df.num, scale = T)

data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>% 
  ggplot(aes(z1,z2,label=pitching_df$playerFullName, color=as.numeric(pitching_df$battersFaced)))+
  geom_point(size=0) +
  labs(title="PCA", x="PC1", y="PC2", color="Batters Faced") +
  theme_bw() +
  ylim(-5,5) +
  geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)+
  scale_color_gradient(low = "blue", high = "red")
## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_text()`).

# Dimension 1 is starting pitchers
ggplot(data.frame(variable = names(pca$rotation[, 1]), loading = pca$rotation[, 1]), 
       aes(x = reorder(variable, loading), y = loading)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Variable", y = "Loading on PC1", title = "Loadings for Principal Component 1") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Optional: rotate axis labels

# Dimension 2 is relief pitchers/closers
ggplot(data.frame(variable = names(pca$rotation[, 2]), loading = pca$rotation[, 2]), 
       aes(x = reorder(variable, loading), y = loading)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  coord_flip() + 
  theme_minimal() +
  labs(x = "Variable", y = "Loading on PC2", title = "Loadings for Principal Component 2") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Optional: rotate axis labels

fviz_contrib(pca, choice = "var", axes = 1)

fviz_contrib(pca, choice = "var", axes = 2)

fviz_pca_var(pca, col.var = "contrib")

fviz_pca_biplot(pca, repel = TRUE, col.var = "contrib", geom = "point",  pointshape = 21, pointsize = 2, label = "all")

Plot of the proportion of variance explained by each principal component including the cumulative proportion

# Calculate proportion of variance explained
var_explained = pca$sdev^2 / sum(pca$sdev^2)
# Create a data frame with the results
summary_df = data.frame(
  PC = paste0("PC", 1:length(var_explained)),
  Proportion = var_explained,
  Cumulative = cumsum(var_explained)
)
# Plot the proportion of variance explained and the cummulative proportion
ggplot(head(summary_df, 9), aes(x = PC, y = Proportion)) +
  geom_col(fill = "steelblue") +
  geom_line(aes(y = Proportion), group = 1, color = "black") +
  geom_point(aes(y = Proportion), color = "black") +
  geom_line(aes(y = Cumulative), group = 1, color = "red") +
  geom_text(aes(y = Cumulative, label = scales::percent(Cumulative, accuracy = 0.1)), 
            vjust = -0.5, size = 3) +  # Adds text for line peaks
  geom_point(aes(y = Cumulative), color = "red") +
  geom_text(aes(label = scales::percent(Proportion, accuracy = 0.1)), 
            vjust = -0.5, size = 3) +  # Adds text for bar peaks
  
  scale_y_continuous(labels = scales::percent,
                     sec.axis = sec_axis(~., name = "Cumulative Proportion")) +
  labs(title = "Variance Explained by Principal Components",
       x = "Principal Component",
       y = "Proportion of Variance Explained") +
  theme_minimal()

Find optimal loadings

Print the top 10 players

pitching_df$playerFullName[order(pca$x[,1])][(nrow(pitching_df)-5):nrow(pitching_df)]
## [1] "Nick Burdi"    "Zach Penrod"   "Riley O'Brien" "Will Klein"   
## [5] "Amir Garrett"  "Jairo Iriarte"
pitching_df$playerFullName[order(pca$x[,1])][1:10]
##  [1] "Aaron Nola"      "Kutter Crawford" "Seth Lugo"       "Luis Severino"  
##  [5] "Logan Webb"      "José Berríos"    "Kevin Gausman"   "Miles Mikolas"  
##  [9] "Patrick Corbin"  "JP Sears"

Show the number of components to keep using Kaiser’s Criterion

We can use Kaiser’s Criterion that says the loadings that could be kept have a eigen value > 1 such that the component explains more variance than a single original variable in the data set

eigenvalues <- pca$sdev^2
components_to_keep <- sum(eigenvalues > 1)
components_to_keep 
## [1] 8

8 eigen values can be kept as they have an eigen value > 1

Factor Analysis

Direct Factor Analysis did not work so we had to increase the tolerance for factor analysis, this is likely because this is a very noisy data set. We did it by removing the highly correlated variables

ev <- eigen(cor(pitching_df.num))
nS <- nScree(x=ev$values)
plotnScree(nS,legend = T)

print(ev$values)
##  [1] 1.718456e+01 9.832768e+00 4.305317e+00 3.596905e+00 3.048949e+00
##  [6] 2.152778e+00 2.036800e+00 1.572300e+00 9.647913e-01 8.749767e-01
## [11] 8.506455e-01 7.823639e-01 6.557721e-01 5.964013e-01 5.172547e-01
## [16] 4.599051e-01 3.602745e-01 3.038632e-01 2.748158e-01 2.286877e-01
## [21] 2.241795e-01 1.678282e-01 1.498028e-01 1.301776e-01 1.237122e-01
## [26] 1.082414e-01 9.206531e-02 7.848734e-02 6.691190e-02 5.878771e-02
## [31] 4.021497e-02 3.058856e-02 2.748036e-02 2.130271e-02 1.919845e-02
## [36] 1.895951e-02 1.533066e-02 6.572728e-03 6.351744e-03 4.795298e-03
## [41] 2.620511e-03 1.944970e-03 1.854508e-03 8.882215e-04 7.895135e-04
## [46] 6.436148e-04 7.529294e-05 4.501264e-05 1.688850e-05 6.487918e-06
## [51] 1.426479e-14 8.167849e-16
nzv <- nearZeroVar(pitching_df.num, saveMetrics = TRUE)
nzv
pitching_df_reduced <- pitching_df.num[, !nzv$nzv]

# Scale the pitching_df
pitching_df_scaled <- scale(pitching_df_reduced)

# Remove highly correlated variables with a cutoff of 0.95
cor_matrix <- cor(pitching_df_scaled)
high_cor <- findCorrelation(cor_matrix, cutoff = 0.95)
pitching_df_reduced <- pitching_df_scaled[, -high_cor]

fit <- factanal(pitching_df_reduced, factors = 3, rotation = "varimax", n.obs = nrow(pitching_df.num) * 1.5)
fit
## 
## Call:
## factanal(x = pitching_df_reduced, factors = 3, n.obs = nrow(pitching_df.num) *     1.5, rotation = "varimax")
## 
## Uniquenesses:
##                     babip                       obp                       slg 
##                     0.643                     0.472                     0.005 
##            strikeoutsPer9              homeRunsPer9                  hitsPer9 
##                     0.890                     0.363                     0.261 
##          inheritedRunners    inheritedRunnersScored         bequeathedRunners 
##                     0.102                     0.194                     0.422 
##   bequeathedRunnersScored             qualityStarts             gamesFinished 
##                     0.629                     0.175                     0.729 
##                   gidpOpp            swingAndMisses               ballsInPlay 
##                     0.858                     0.853                     0.779 
##          strikePercentage pitchesPerPlateAppearance                       iso 
##                     0.984                     0.949                     0.203 
##                   flyOuts                   flyHits                   popHits 
##                     0.110                     0.120                     0.908 
##                  lineHits                groundHits              gamesStarted 
##                     0.061                     0.142                     0.038 
##                      runs               baseOnBalls                hitByPitch 
##                     0.044                     0.223                     0.642 
##                       era         saveOpportunities                     holds 
##                     0.316                     0.845                     0.516 
##                blownSaves       groundOutsToAirouts             winPercentage 
##                     0.577                     0.980                     0.885 
##              walksPer9Inn 
##                     0.928 
## 
## Loadings:
##                           Factor1 Factor2 Factor3
## babip                              0.596         
## obp                       -0.168   0.707         
## slg                                0.991  -0.117 
## strikeoutsPer9                    -0.287   0.163 
## homeRunsPer9                       0.789  -0.117 
## hitsPer9                           0.853         
## inheritedRunners                  -0.108   0.938 
## inheritedRunnersScored                     0.895 
## bequeathedRunners          0.629           0.427 
## bequeathedRunnersScored    0.502           0.344 
## qualityStarts              0.801          -0.421 
## gamesFinished             -0.168  -0.176   0.460 
## gidpOpp                            0.357         
## swingAndMisses                    -0.358   0.133 
## ballsInPlay                0.121   0.421  -0.169 
## strikePercentage           0.115                 
## pitchesPerPlateAppearance         -0.219         
## iso                                0.883  -0.133 
## flyOuts                    0.923          -0.192 
## flyHits                    0.909   0.156  -0.174 
## popHits                    0.304                 
## lineHits                   0.951          -0.177 
## groundHits                 0.920          -0.103 
## gamesStarted               0.862          -0.468 
## runs                       0.956   0.152  -0.139 
## baseOnBalls                0.877                 
## hitByPitch                 0.596                 
## era                       -0.146   0.808         
## saveOpportunities                 -0.210   0.322 
## holds                             -0.193   0.665 
## blownSaves                        -0.127   0.634 
## groundOutsToAirouts                        0.140 
## winPercentage                     -0.331         
## walksPer9Inn              -0.260                 
## 
##                Factor1 Factor2 Factor3
## SS loadings      7.807   5.501   3.843
## Proportion Var   0.230   0.162   0.113
## Cumulative Var   0.230   0.391   0.504
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 16769.81 on 462 degrees of freedom.
## The p-value is 0

Factor Analysis with PCA

We used the first 10 components from the PCA to perform factor analysis

fa_pca <- prcomp(pitching_df.num, scale = T)
summary(fa_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5    PC6     PC7
## Standard deviation     4.1454 3.1357 2.07493 1.89655 1.74612 1.4672 1.42717
## Proportion of Variance 0.3305 0.1891 0.08279 0.06917 0.05863 0.0414 0.03917
## Cumulative Proportion  0.3305 0.5196 0.60236 0.67153 0.73016 0.7716 0.81073
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     1.25391 0.98224 0.93540 0.92230 0.88451 0.80980 0.77227
## Proportion of Variance 0.03024 0.01855 0.01683 0.01636 0.01505 0.01261 0.01147
## Cumulative Proportion  0.84097 0.85952 0.87635 0.89271 0.90775 0.92036 0.93183
##                           PC15    PC16    PC17    PC18    PC19   PC20    PC21
## Standard deviation     0.71920 0.67816 0.60023 0.55124 0.52423 0.4782 0.47348
## Proportion of Variance 0.00995 0.00884 0.00693 0.00584 0.00528 0.0044 0.00431
## Cumulative Proportion  0.94178 0.95062 0.95755 0.96340 0.96868 0.9731 0.97739
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.40967 0.38704 0.3608 0.35173 0.32900 0.30342 0.28016
## Proportion of Variance 0.00323 0.00288 0.0025 0.00238 0.00208 0.00177 0.00151
## Cumulative Proportion  0.98062 0.98350 0.9860 0.98838 0.99046 0.99223 0.99374
##                           PC29    PC30    PC31    PC32    PC33    PC34    PC35
## Standard deviation     0.25867 0.24246 0.20054 0.17490 0.16577 0.14595 0.13856
## Proportion of Variance 0.00129 0.00113 0.00077 0.00059 0.00053 0.00041 0.00037
## Cumulative Proportion  0.99503 0.99616 0.99693 0.99752 0.99805 0.99846 0.99883
##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.13769 0.12382 0.08107 0.07970 0.06925 0.05119 0.04410
## Proportion of Variance 0.00036 0.00029 0.00013 0.00012 0.00009 0.00005 0.00004
## Cumulative Proportion  0.99919 0.99949 0.99961 0.99974 0.99983 0.99988 0.99992
##                           PC43    PC44    PC45    PC46     PC47     PC48
## Standard deviation     0.04306 0.02980 0.02810 0.02537 0.008677 0.006709
## Proportion of Variance 0.00004 0.00002 0.00002 0.00001 0.000000 0.000000
## Cumulative Proportion  0.99995 0.99997 0.99998 1.00000 1.000000 1.000000
##                           PC49     PC50      PC51      PC52
## Standard deviation     0.00411 0.002547 9.903e-16 3.026e-16
## Proportion of Variance 0.00000 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion  1.00000 1.000000 1.000e+00 1.000e+00
# Pick the first 10 components
compnets_for_fa <- fa_pca$x[, 1:10]
fa <- factanal(compnets_for_fa, factors = 5, rotation = "varimax")
fa
## 
## Call:
## factanal(x = compnets_for_fa, factors = 5, rotation = "varimax")
## 
## Uniquenesses:
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 
## 1.00 0.75 0.75 1.00 1.00 0.75 0.75 1.00 1.00 0.75 
## 
## Loadings:
##      Factor1 Factor2 Factor3 Factor4 Factor5
## PC1                                         
## PC2                                   0.490 
## PC3                           0.492         
## PC4                                         
## PC5                                         
## PC6           0.492                         
## PC7                   0.486                 
## PC8                                         
## PC9                                         
## PC10  0.499                                 
## 
##                Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings      0.250   0.250   0.250   0.250   0.250
## Proportion Var   0.025   0.025   0.025   0.025   0.025
## Cumulative Var   0.025   0.050   0.075   0.100   0.125
## 
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 0 on 5 degrees of freedom.
## The p-value is 1
# Scree plot to determine number of factors using Kaiser rule
eigenvalues <- fa_pca$sdev^2

plot(eigenvalues, type = "b", 
     # Smaller circle
     pch = 0,
     xlab = "Principal Component", 
     ylab = "Eigenvalue", 
     main = "Scree Plot")
abline(h = 1, col = "red", lty = 2) # Kaiser rule threshold line at eigenvalue = 1

Graph the loadings

loadings_df <- data.frame(Variable = rownames(fa$loadings))

for(i in 1:ncol(fa$loadings)) {
    loadings_df[paste0("Factor ", i)] <- fa$loadings[,i]
}

loadings_long <- melt(loadings_df, 
                     id.vars = "Variable",
                     variable.name = "Factor",
                     value.name = "Loading")

ggplot(loadings_long, aes(x = Variable, y = Loading)) +
  geom_col(fill = "lightgreen", width = 0.7) +
  facet_grid(Factor~.) +
  geom_text(aes(label = round(Loading, 2)), 
            color = "black",
            vjust = 1.3,
            size = 2.5) +
  labs(title = "Factor Loadings",
       x = "Component",
       y = "Loading") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_blank()
)

Clustering

KMEANS

# First how much centers should we have 
fviz_nbclust(pitching_df.num, kmeans, method = 'silhouette', k.max = 10) # -> shows 2 clusters is optimal

k = 3 # -> I chose 3 anyway
fit =  kmeans(scale(pitching_df.num), k, nstart=100)
groups = fit$cluster
# How many players are in each cluster
barplot(table(groups), col="darkblue")

# Create unique labels by appending an index to each duplicate name
unique_labels <- make.unique(as.character(pitching_df$playerFullName))
rownames(pitching_df.num) <- unique_labels
fviz_cluster(fit, pitching_df.num, geom = "text", labelsize=8, main = "Clusters with K-means")

# More readable plot
fviz_cluster(fit, data = pitching_df.num, geom = "point", label = "none", main = "Clusters with K-means") + geom_text(label = unique_labels, check_overlap = T, size = 3)

Interpretation of centers:

Plot of the contribution of each variable to each cluster

centers=fit$centers
tidy = cbind(
  gather(as.data.frame(t(centers)), "cluster", "coor"),
  var=rep(colnames(centers, k)),
  size=rep(table(fit$cluster), each=ncol(centers))
  )

tidy %>%
  ggplot(aes(x=cluster, y=coor, fill=cluster)) +
  geom_col() +
  facet_wrap(~var) +
  geom_text(aes(label=size),position=position_stack(1.2))

Different plot also showing how each variable contributes to the clusters

centers=fit$centers
tidy = cbind(
  gather(as.data.frame(t(centers)), "cluster", "coor"),
  var=rep(colnames(centers, k)),
  size=rep(table(fit$cluster), each=ncol(centers)) 
  )

tidy %>%
  ggplot(aes(x=var, y=coor, fill=var)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~cluster) +
  theme(axis.text.x = element_text(angle=45),legend.position = "none")

Showing clusters individually

# Use the colors as in fviz_cluster
colors <- c("#fc7671", "#00bb46", "#5d9bfa")
groups = fit$cluster
lapply(1:3, function(cluster_id) {
  # Subset data
  subset_data <- pitching_df.num[groups == cluster_id, ]

  # Re-run clustering and plot
  fit_subset <- kmeans(scale(subset_data), centers = 1, nstart = 50)
    p2 <- fviz_cluster(fit_subset, 
                  data = subset_data, 
                  geom = c("point", "text"),
                  labelsize = 6,
                  repel = F,
                  ellipse.type = "convex",
                  ggtheme = theme_minimal()) +
                  scale_color_manual(values = colors[cluster_id]) +
                  theme(legend.position = "none") +
                  ggtitle(paste("Cluster", cluster_id))
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

A silhouette plot to show how well the clusters are separated Eclust scalss the data automatically for us

fit.kmeans = eclust(pitching_df.num, "kmeans", k=3, stand=TRUE, nstart=100, graph = FALSE)
fviz_silhouette(fit.kmeans)
##   cluster size ave.sil.width
## 1       1  137          0.36
## 2       2  207          0.19
## 3       3  304          0.13

Choosing the number of clusters for kmeans

WSS method Silhouette method Gap statistic method

# WSS
fviz_nbclust(scale(pitching_df.num), kmeans, method = "wss", k.max = 20, nstart = 200, iter.max = 100, nboot = 100)

# Silhouette
fviz_nbclust(scale(pitching_df.num), kmeans, method = "silhouette", k.max = 20, nstart = 1000, iter.max = 100)

# Gap statistic
fviz_nbclust(scale(pitching_df.num), kmeans, method = 'gap_stat', k.max = 7, nstart = 20, nboot = 100, iter.max = 500)

Kmeans of PCA

Does the # of clusters correlate to without pca?

No the optimal still says 2

WSS method

cluster_pca <- prcomp(scale(pitching_df.num), scale = T)

my_pca_pitching_df <- data.frame(cluster_pca$x[, 1:2])

fviz_nbclust(my_pca_pitching_df, kmeans, method = 'silhouette', k.max = 10)

my_kmeans <- kmeans(my_pca_pitching_df, centers = 3)

rownames(my_pca_pitching_df) <- unique_labels
fviz_cluster(my_kmeans, data = my_pca_pitching_df, geom = "text", labelsize=8)

centers=my_kmeans$centers
tidy = cbind(
  gather(as.data.frame(t(centers)), "cluster", "coor"),
  var=rep(colnames(centers, k)),
  size=rep(table(my_kmeans$cluster), each=ncol(centers))
)

tidy %>%
  ggplot(aes(x=cluster, y=coor, fill=cluster)) +
  geom_col() +
  facet_wrap(~var) +
  geom_text(aes(label=size),position=position_stack(1.2))

centers=my_kmeans$centers
tidy = cbind(
  gather(as.data.frame(t(centers)), "cluster", "coor"),
  var=rep(colnames(centers, k)),
  size=rep(table(my_kmeans$cluster), each=ncol(centers))
  )

tidy %>%
  ggplot(aes(x=var, y=coor, fill=var)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~cluster) +
  theme(axis.text.x = element_text(angle=45),legend.position = "none")

PAM clustering

fit.pam = eclust(pitching_df.num, FUNcluster="pam", stand = TRUE, k=3,
                 graph = T, nstart=1000)

fviz_cluster(fit.pam, geom="point", main="Clusters with PAM", ellipse.type = "norm")+
geom_text(label=pitching_df$playerFullName, check_overlap = T)

Number of clusters for PAM

# WSS
fviz_nbclust(scale(pitching_df.num), pam, method="wss", k.max=10)

# Silhouette
fviz_nbclust(scale(pitching_df.num), pam, method="silhouette", k.max=10)

# Gap statistic
fviz_nbclust(scale(pitching_df.num), pam, method="gap_stat", k.max=10)

Adjusted rand index

adjustedRandIndex(fit.kmeans$cluster, fit.pam$clustering)
## [1] 0.7661753

Kernel Estimation

create_kde_plot <- function(pitching_df, col_name) {
  p <- ggplot(pitching_df, aes_string(x = col_name)) +
    geom_density(fill = "darkblue", color = "black", alpha = 0.7) +
    labs(title = paste("Kernel Density Estimation of", col_name), 
         x = col_name, y = "Density") +
    theme_minimal()
  
  print(p)
}

for (col_name in names(pitching_df.num)) {
  if (is.numeric(pitching_df.num[[col_name]])) {
    create_kde_plot(pitching_df.num, col_name)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.